<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
    <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
    <meta content="Apache Forrest" name="Generator">
    <meta name="Forrest-version" content="0.9">
    <meta name="Forrest-skin-name" content="pelt">
    <title>ZooKeeper</title>
    <link type="text/css" href="skin/basic.css" rel="stylesheet">
    <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
    <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
    <link type="text/css" href="skin/profile.css" rel="stylesheet">
    <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script>
    <script src="skin/getMenu.js" language="javascript" type="text/javascript"></script>
    <script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
    <link rel="shortcut icon" href="images/favicon.ico">
</head>
<body onload="init()">
<script type="text/javascript">ndeSetTextSize();</script>
<div id="top">
    <!--+
        |breadtrail
        +-->
    <div class="breadtrail">
        <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://zookeeper.apache.org/">ZooKeeper</a> &gt; <a
            href="http://zookeeper.apache.org/">ZooKeeper</a>
        <script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
    </div>
    <!--+
        |header
        +-->
    <div class="header">
        <!--+
            |start group logo
            +-->
        <div class="grouplogo">
            <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg"
                                                     title="Apache Hadoop"></a>
        </div>
        <!--+
            |end group logo
            +-->
        <!--+
            |start Project Logo
            +-->
        <div class="projectlogo">
            <a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper"
                                                        src="images/zookeeper_small.gif"
                                                        title="ZooKeeper: distributed coordination"></a>
        </div>
        <!--+
            |end Project Logo
            +-->
        <!--+
            |start Search
            +-->
        <div class="searchbox">
            <form action="http://www.google.com/search" method="get" class="roundtopsmall">
                <input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input
                    onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text"
                    value="Search the site with google">&nbsp;
                <input name="Search" value="Search" type="submit">
            </form>
        </div>
        <!--+
            |end search
            +-->
        <!--+
            |start Tabs
            +-->
        <ul id="tabs">
            <li>
                <a class="unselected" href="http://zookeeper.apache.org/">Project</a>
            </li>
            <li>
                <a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a>
            </li>
            <li class="current">
                <a class="selected" href="index.html">ZooKeeper 3.4 Documentation</a>
            </li>
        </ul>
        <!--+
            |end Tabs
            +-->
    </div>
</div>
<div id="main">
    <div id="publishedStrip">
        <!--+
            |start Subtabs
            +-->
        <div id="level2tabs"></div>
        <!--+
            |end Endtabs
            +-->
        <script type="text/javascript"><!--
        document.write("Last Published: " + document.lastModified);
        //  --></script>
    </div>
    <!--+
        |breadtrail
        +-->
    <div class="breadtrail">

        &nbsp;
    </div>
    <!--+
        |start Menu, mainarea
        +-->
    <!--+
        |start Menu
        +-->
    <div id="menu">
        <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle"
             style="background-image: url('skin/images/chapter_open.gif');">Overview
        </div>
        <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
            <div class="menuitem">
                <a href="index.html">Welcome</a>
            </div>
            <div class="menupage">
                <div class="menupagetitle">Overview</div>
            </div>
            <div class="menuitem">
                <a href="zookeeperStarted.html">Getting Started</a>
            </div>
            <div class="menuitem">
                <a href="releasenotes.html">Release Notes</a>
            </div>
        </div>
        <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div>
        <div id="menu_1.2" class="menuitemgroup">
            <div class="menuitem">
                <a href="api/index.html">API Docs</a>
            </div>
            <div class="menuitem">
                <a href="zookeeperProgrammers.html">Programmer's Guide</a>
            </div>
            <div class="menuitem">
                <a href="javaExample.html">Java Example</a>
            </div>
            <div class="menuitem">
                <a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
            </div>
            <div class="menuitem">
                <a href="recipes.html">Recipes</a>
            </div>
        </div>
        <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">BookKeeper</div>
        <div id="menu_1.3" class="menuitemgroup">
            <div class="menuitem">
                <a href="bookkeeperStarted.html">Getting started</a>
            </div>
            <div class="menuitem">
                <a href="bookkeeperOverview.html">Overview</a>
            </div>
            <div class="menuitem">
                <a href="bookkeeperConfig.html">Setup guide</a>
            </div>
            <div class="menuitem">
                <a href="bookkeeperProgrammer.html">Programmer's guide</a>
            </div>
        </div>
        <div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Admin &amp; Ops</div>
        <div id="menu_1.4" class="menuitemgroup">
            <div class="menuitem">
                <a href="zookeeperAdmin.html">Administrator's Guide</a>
            </div>
            <div class="menuitem">
                <a href="zookeeperQuotas.html">Quota Guide</a>
            </div>
            <div class="menuitem">
                <a href="zookeeperJMX.html">JMX</a>
            </div>
            <div class="menuitem">
                <a href="zookeeperObservers.html">Observers Guide</a>
            </div>
        </div>
        <div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Contributor</div>
        <div id="menu_1.5" class="menuitemgroup">
            <div class="menuitem">
                <a href="zookeeperInternals.html">ZooKeeper Internals</a>
            </div>
        </div>
        <div onclick="SwitchMenu('menu_1.6', 'skin/')" id="menu_1.6Title" class="menutitle">Miscellaneous</div>
        <div id="menu_1.6" class="menuitemgroup">
            <div class="menuitem">
                <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a>
            </div>
            <div class="menuitem">
                <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>
            </div>
            <div class="menuitem">
                <a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a>
            </div>
        </div>
        <div id="credit"></div>
        <div id="roundbottom">
            <img style="display: none" class="corner" height="15" width="15" alt=""
                 src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
        <!--+
          |alternative credits
          +-->
        <div id="credit2"></div>
    </div>
    <!--+
        |end Menu
        +-->
    <!--+
        |start content
        +-->
    <div id="content">
        <div title="Portable Document Format" class="pdflink">
            <a class="dida" href="zookeeperOver.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
                PDF</a>
        </div>
        <h1>ZooKeeper</h1>
        <div id="front-matter">
            <div id="minitoc-area">
                <ul class="minitoc">
                    <li>
                        <a href="#ch_DesignOverview">ZooKeeper: A Distributed Coordination Service for Distributed
                            Applications</a>
                        <ul class="minitoc">
                            <li>
                                <a href="#sc_designGoals">Design Goals</a>
                            </li>
                            <li>
                                <a href="#sc_dataModelNameSpace">Data model and the hierarchical namespace</a>
                            </li>
                            <li>
                                <a href="#Nodes+and+ephemeral+nodes">Nodes and ephemeral nodes</a>
                            </li>
                            <li>
                                <a href="#Conditional+updates+and+watches">Conditional updates and watches</a>
                            </li>
                            <li>
                                <a href="#Guarantees">Guarantees</a>
                            </li>
                            <li>
                                <a href="#Simple+API">Simple API</a>
                            </li>
                            <li>
                                <a href="#Implementation">Implementation</a>
                            </li>
                            <li>
                                <a href="#Uses">Uses</a>
                            </li>
                            <li>
                                <a href="#Performance">Performance</a>
                            </li>
                            <li>
                                <a href="#Reliability">Reliability</a>
                            </li>
                            <li>
                                <a href="#The+ZooKeeper+Project">The ZooKeeper Project</a>
                            </li>
                        </ul>
                    </li>
                </ul>
            </div>
        </div>


        <a name="ch_DesignOverview"></a>
        <h2 class="h3">ZooKeeper: A Distributed Coordination Service for Distributed
            Applications</h2>
        <div class="section">
            <p>ZooKeeper is a distributed, open-source coordination service for
                distributed applications. It exposes a simple set of primitives that
                distributed applications can build upon to implement higher level services
                for synchronization, configuration maintenance, and groups and naming. It
                is designed to be easy to program to, and uses a data model styled after
                the familiar directory tree structure of file systems. It runs in Java and
                has bindings for both Java and C.</p>
            <p>Coordination services are notoriously hard to get right. They are
                especially prone to errors such as race conditions and deadlock. The
                motivation behind ZooKeeper is to relieve distributed applications of the
                responsibility of implementing coordination services from scratch.</p>
            <a name="sc_designGoals"></a>
            <h3 class="h4">Design Goals</h3>
            <p>
                <strong>ZooKeeper is simple.</strong> ZooKeeper
                allows distributed processes to coordinate with each other through a
                shared hierarchical namespace which is organized similarly to a standard
                file system. The name space consists of data registers - called znodes,
                in ZooKeeper parlance - and these are similar to files and directories.
                Unlike a typical file system, which is designed for storage, ZooKeeper
                data is kept in-memory, which means ZooKeeper can achieve high
                throughput and low latency numbers.</p>
            <p>The ZooKeeper implementation puts a premium on high performance,
                highly available, strictly ordered access. The performance aspects of
                ZooKeeper mean it can be used in large, distributed systems. The
                reliability aspects keep it from being a single point of failure. The
                strict ordering means that sophisticated synchronization primitives can
                be implemented at the client.</p>
            <p>
                <strong>ZooKeeper is replicated.</strong> Like the
                distributed processes it coordinates, ZooKeeper itself is intended to be
                replicated over a set of hosts called an ensemble.</p>
            <table class="ForrestTable" cellspacing="1" cellpadding="4">
                <tr>
                    <td>ZooKeeper Service</td>
                </tr>
                <tr>
                    <td>

                        <img alt="" src="images/zkservice.jpg">

                    </td>
                </tr>
            </table>
            <p>The servers that make up the ZooKeeper service must all know about
                each other. They maintain an in-memory image of state, along with
                transaction logs and snapshots in a persistent store. As long as a
                majority of the servers are available, the ZooKeeper service will be
                available.</p>
            <p>Clients connect to a single ZooKeeper server. The client maintains
                a TCP connection through which it sends requests, gets responses, gets
                watch events, and sends heart beats. If the TCP connection to the server
                breaks, the client will connect to a different server.</p>
            <p>
                <strong>ZooKeeper is ordered.</strong> ZooKeeper
                stamps each update with a number that reflects the order of all
                ZooKeeper transactions. Subsequent operations can use the order to
                implement higher-level abstractions, such as synchronization
                primitives.</p>
            <p>
                <strong>ZooKeeper is fast.</strong> It is
                especially fast in "read-dominant" workloads. ZooKeeper applications run
                on thousands of machines, and it performs best where reads are more
                common than writes, at ratios of around 10:1.</p>
            <a name="sc_dataModelNameSpace"></a>
            <h3 class="h4">Data model and the hierarchical namespace</h3>
            <p>The name space provided by ZooKeeper is much like that of a
                standard file system. A name is a sequence of path elements separated by
                a slash (/). Every node in ZooKeeper's name space is identified by a
                path.</p>
            <table class="ForrestTable" cellspacing="1" cellpadding="4">
                <tr>
                    <td>ZooKeeper's Hierarchical Namespace</td>
                </tr>
                <tr>
                    <td>

                        <img alt="" src="images/zknamespace.jpg">

                    </td>
                </tr>
            </table>
            <a name="Nodes+and+ephemeral+nodes"></a>
            <h3 class="h4">Nodes and ephemeral nodes</h3>
            <p>Unlike in standard file systems, each node in a ZooKeeper
                namespace can have data associated with it as well as children. It is
                like having a file-system that allows a file to also be a directory.
                (ZooKeeper was designed to store coordination data: status information,
                configuration, location information, etc., so the data stored at each
                node is usually small, in the byte to kilobyte range.) We use the term
                <em>znode</em> to make it clear that we are talking about
                ZooKeeper data nodes.</p>
            <p>Znodes maintain a stat structure that includes version numbers for
                data changes, ACL changes, and timestamps, to allow cache validations
                and coordinated updates. Each time a znode's data changes, the version
                number increases. For instance, whenever a client retrieves data it also
                receives the version of the data.</p>
            <p>The data stored at each znode in a namespace is read and written
                atomically. Reads get all the data bytes associated with a znode and a
                write replaces all the data. Each node has an Access Control List (ACL)
                that restricts who can do what.</p>
            <p>ZooKeeper also has the notion of ephemeral nodes. These znodes
                exist as long as the session that created the znode is active. When the
                session ends the znode is deleted. Ephemeral nodes are useful when you
                want to implement <em>[tbd]</em>.</p>
            <a name="Conditional+updates+and+watches"></a>
            <h3 class="h4">Conditional updates and watches</h3>
            <p>ZooKeeper supports the concept of <em>watches</em>.
                Clients can set a watch on a znode. A watch will be triggered and
                removed when the znode changes. When a watch is triggered the client
                receives a packet saying that the znode has changed. And if the
                connection between the client and one of the ZooKeeper servers is
                broken, the client will receive a local notification. These can be used
                to <em>[tbd]</em>.</p>
            <a name="Guarantees"></a>
            <h3 class="h4">Guarantees</h3>
            <p>ZooKeeper is very fast and very simple. Since its goal, though, is
                to be a basis for the construction of more complicated services, such as
                synchronization, it provides a set of guarantees. These are:</p>
            <ul>

                <li>

                    <p>Sequential Consistency - Updates from a client will be applied
                        in the order that they were sent.</p>

                </li>


                <li>

                    <p>Atomicity - Updates either succeed or fail. No partial
                        results.</p>

                </li>


                <li>

                    <p>Single System Image - A client will see the same view of the
                        service regardless of the server that it connects to.</p>

                </li>

            </ul>
            <ul>

                <li>

                    <p>Reliability - Once an update has been applied, it will persist
                        from that time forward until a client overwrites the update.</p>

                </li>

            </ul>
            <ul>

                <li>

                    <p>Timeliness - The client's view of the system is guaranteed to
                        be up-to-date within a certain time bound.</p>

                </li>

            </ul>
            <p>For more information on these, and how they can be used, see
                <em>[tbd]</em>
            </p>
            <a name="Simple+API"></a>
            <h3 class="h4">Simple API</h3>
            <p>One of the design goals of ZooKeeper is to provide a very simple
                programming interface. As a result, it supports only these
                operations:</p>
            <dl>

                <dt>
                    <term>create</term>
                </dt>
                <dd>
                    <p>creates a node at a location in the tree</p>
                </dd>


                <dt>
                    <term>delete</term>
                </dt>
                <dd>
                    <p>deletes a node</p>
                </dd>


                <dt>
                    <term>exists</term>
                </dt>
                <dd>
                    <p>tests if a node exists at a location</p>
                </dd>


                <dt>
                    <term>get data</term>
                </dt>
                <dd>
                    <p>reads the data from a node</p>
                </dd>


                <dt>
                    <term>set data</term>
                </dt>
                <dd>
                    <p>writes data to a node</p>
                </dd>


                <dt>
                    <term>get children</term>
                </dt>
                <dd>
                    <p>retrieves a list of children of a node</p>
                </dd>


                <dt>
                    <term>sync</term>
                </dt>
                <dd>
                    <p>waits for data to be propagated</p>
                </dd>

            </dl>
            <p>For a more in-depth discussion on these, and how they can be used
                to implement higher level operations, please refer to
                <em>[tbd]</em>
            </p>
            <a name="Implementation"></a>
            <h3 class="h4">Implementation</h3>
            <p>
                <a href="#fg_zkComponents">ZooKeeper Components</a> shows the high-level components
                of the ZooKeeper service. With the exception of the request processor,
                each of
                the servers that make up the ZooKeeper service replicates its own copy
                of each of the components.</p>
            <a name="fg_zkComponents"></a>
            <table class="ForrestTable" cellspacing="1" cellpadding="4">
                <tr>
                    <td>ZooKeeper Components</td>
                </tr>
                <tr>
                    <td>

                        <img alt="" src="images/zkcomponents.jpg">

                    </td>
                </tr>
            </table>
            <p>The replicated database is an in-memory database containing the
                entire data tree. Updates are logged to disk for recoverability, and
                writes are serialized to disk before they are applied to the in-memory
                database.</p>
            <p>Every ZooKeeper server services clients. Clients connect to
                exactly one server to submit requests. Read requests are serviced from
                the local replica of each server database. Requests that change the
                state of the service, write requests, are processed by an agreement
                protocol.</p>
            <p>As part of the agreement protocol all write requests from clients
                are forwarded to a single server, called the
                <em>leader</em>. The rest of the ZooKeeper servers, called
                <em>followers</em>, receive message proposals from the
                leader and agree upon message delivery. The messaging layer takes care
                of replacing leaders on failures and syncing followers with
                leaders.</p>
            <p>ZooKeeper uses a custom atomic messaging protocol. Since the
                messaging layer is atomic, ZooKeeper can guarantee that the local
                replicas never diverge. When the leader receives a write request, it
                calculates what the state of the system is when the write is to be
                applied and transforms this into a transaction that captures this new
                state.</p>
            <a name="Uses"></a>
            <h3 class="h4">Uses</h3>
            <p>The programming interface to ZooKeeper is deliberately simple.
                With it, however, you can implement higher order operations, such as
                synchronization primitives, group membership, ownership, etc. Some
                distributed applications have used it to: <em>[tbd: add uses from
                    white paper and video presentation.]</em> For more information, see
                <em>[tbd]</em>
            </p>
            <a name="Performance"></a>
            <h3 class="h4">Performance</h3>
            <p>ZooKeeper is designed to be highly performant. But is it? The
                results of the ZooKeeper development team at Yahoo! Research indicate
                that it is. (See <a href="#fg_zkPerfRW">ZooKeeper Throughput as the Read-Write Ratio Varies</a>.) It is
                especially high
                performance in applications where reads outnumber writes, since writes
                involve synchronizing the state of all servers. (Reads outnumbering
                writes is typically the case for a coordination service.)</p>
            <a name="fg_zkPerfRW"></a>
            <table class="ForrestTable" cellspacing="1" cellpadding="4">
                <tr>
                    <td>ZooKeeper Throughput as the Read-Write Ratio Varies</td>
                </tr>
                <tr>
                    <td>

                        <img alt="" src="images/zkperfRW-3.2.jpg">

                    </td>
                </tr>
            </table>
            <p>The figure <a href="#fg_zkPerfRW">ZooKeeper Throughput as the Read-Write Ratio Varies</a> is a throughput
                graph of ZooKeeper release 3.2 running on servers with dual 2GHz
                Xeon and two SATA 15K RPM drives. One drive was used as a
                dedicated ZooKeeper log device. The snapshots were written to
                the OS drive. Write requests were 1K writes and the reads were
                1K reads. "Servers" indicate the size of the ZooKeeper
                ensemble, the number of servers that make up the
                service. Approximately 30 other servers were used to simulate
                the clients. The ZooKeeper ensemble was configured such that
                leaders do not allow connections from clients.</p>
            <div class="note">
                <div class="label">Note</div>
                <div class="content">
                    <p>In version 3.2 r/w performance improved by ~2x
                        compared to the <a
                                href="http://zookeeper.apache.org/docs/r3.1.1/zookeeperOver.html#Performance">previous
                            3.1 release</a>.</p>
                </div>
            </div>
            <p>Benchmarks also indicate that it is reliable. <a href="#fg_zkPerfReliability">Reliability in the
                Presence of Errors</a> shows how a deployment responds to
                various failures. The events marked in the figure are the
                following:</p>
            <ol>

                <li>

                    <p>Failure and recovery of a follower</p>

                </li>


                <li>

                    <p>Failure and recovery of a different follower</p>

                </li>


                <li>

                    <p>Failure of the leader</p>

                </li>


                <li>

                    <p>Failure and recovery of two followers</p>

                </li>


                <li>

                    <p>Failure of another leader</p>

                </li>

            </ol>
            <a name="Reliability"></a>
            <h3 class="h4">Reliability</h3>
            <p>To show the behavior of the system over time as
                failures are injected we ran a ZooKeeper service made up of
                7 machines. We ran the same saturation benchmark as before,
                but this time we kept the write percentage at a constant
                30%, which is a conservative ratio of our expected
                workloads.
            </p>
            <a name="fg_zkPerfReliability"></a>
            <table class="ForrestTable" cellspacing="1" cellpadding="4">
                <tr>
                    <td>Reliability in the Presence of Errors</td>
                </tr>
                <tr>
                    <td>

                        <img alt="" src="images/zkperfreliability.jpg">

                    </td>
                </tr>
            </table>
            <p>There are a few important observations from this graph. First, if
                followers fail and recover quickly, then ZooKeeper is able to sustain a
                high throughput despite the failure. Second, and maybe more importantly, the
                leader election algorithm allows for the system to recover fast enough
                to prevent throughput from dropping substantially. In our observations,
                ZooKeeper takes less than 200ms to elect a new leader. Third, as
                followers recover, ZooKeeper is able to raise throughput again once they
                start processing requests.</p>
            <a name="The+ZooKeeper+Project"></a>
            <h3 class="h4">The ZooKeeper Project</h3>
            <p>ZooKeeper has been
                <a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/PoweredBy">
                    successfully used
                </a>
                in many industrial applications. It is used at Yahoo! as the
                coordination and failure recovery service for Yahoo! Message
                Broker, which is a highly scalable publish-subscribe system
                managing thousands of topics for replication and data
                delivery. It is used by the Fetching Service for Yahoo!
                crawler, where it also manages failure recovery. A number of
                Yahoo! advertising systems also use ZooKeeper to implement
                reliable services.
            </p>
            <p>All users and developers are encouraged to join the
                community and contribute their expertise. See the
                <a href="http://zookeeper.apache.org/">
                    ZooKeeper Project on Apache
                </a>
                for more information.
            </p>
        </div>

        <p align="right">
            <font size="-2"></font>
        </p>
    </div>
    <!--+
        |end content
        +-->
    <div class="clearboth">&nbsp;</div>
</div>
<div id="footer">
    <!--+
        |start bottomstrip
        +-->
    <div class="lastmodified">
        <script type="text/javascript"><!--
        document.write("Last Published: " + document.lastModified);
        //  --></script>
    </div>
    <div class="copyright">
        Copyright &copy;
        2008-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
    </div>
    <!--+
        |end bottomstrip
        +-->
</div>
</body>
</html>
